# Calculate/load Differential Expression metrics for all genes, load SFARI dataset:

# Gene expression data
load('./../working_data/RNAseq_ASD_4region_normalized_vst.Rdata')

# SFARI genes
SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv')
SFARI_genes = SFARI_genes %>% inner_join(datProbes, by=c('gene-symbol'='external_gene_id')) %>%
              mutate('ID' = ensembl_gene_id) %>%
              dplyr::select(ID, `gene-score`, syndromic)

# Balance Groups by covariates, remove singular batches (none)
to_keep = (datMeta$Subject_ID != 'AN03345')
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
  
if(!file.exists('./../working_data/genes_ASD_DE_info_vst.csv')) {
  
  # Calculate differential expression for ASD
  mod = model.matrix(~ Dx, data=datMeta)
  corfit = duplicateCorrelation(datExpr, mod, block=datMeta$Subject_ID)
  lmfit = lmFit(datExpr, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)
  
  fit = eBayes(lmfit, trend=T, robust=T)
  top_genes = topTable(fit, coef=2, number=nrow(datExpr))
  genes_DE_info = top_genes[match(rownames(datExpr), rownames(top_genes)),] %>%
                  mutate('ID'=rownames(datExpr)) %>% left_join(SFARI_genes, by='ID')
  
  write_csv(genes_DE_info, path='./../working_data/genes_ASD_DE_info_vst.csv')
  
  rm(mod, corfit, lmfit, fit, top_genes)
} else {
genes_DE_info = read_csv('./../working_data/genes_ASD_DE_info_vst.csv')
}

genes_DE_info = genes_DE_info %>% dplyr::select(ID, logFC, AveExpr, t, P.Value, adj.P.Val, 
                                                B, `gene-score`, syndromic) %>%
                mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`))


datExpr = datExpr %>% data.frame
datExpr_backup = datExpr


rm(to_keep, datProbes)

Raw data

datExpr_raw = read.csv('./../raw_data/RNAseq_ASD_datExpr.csv', row.names=1)
datExpr_raw = datExpr_raw[substr(rownames(datExpr_raw),1,3)=='ENS',
                          substring(colnames(datExpr_raw),2) %in% datMeta$Dissected_Sample_ID]
datExpr_raw = datExpr_raw[rowSums(datExpr_raw)>5,]

datExpr_ASD = datExpr_raw %>% dplyr::select(which(datMeta$Diagnosis_=='ASD')) %>% rowMeans
datExpr_CTL = datExpr_raw %>% dplyr::select(which(datMeta$Diagnosis_!='ASD')) %>% rowMeans

ASD_vs_CTL = data.frame('ID'=rownames(datExpr_raw), 'ASD'=datExpr_ASD+1, 'CTL'=datExpr_CTL+1) %>%
             left_join(SFARI_genes, by='ID') %>% mutate(`gene-score`=as.factor(`gene-score`))

p1 = ggplotly(ASD_vs_CTL %>% ggplot(aes(x=ASD, y=CTL, color=`gene-score`)) + 
              scale_x_continuous(trans='log2') + scale_y_continuous(trans='log2') + geom_point(alpha=0.4) + 
              geom_smooth(method=lm, se=FALSE, color='gray') +
              ggtitle('Mean expression by gene ASD vs CTL') + theme_minimal())

p2 = ggplotly(ASD_vs_CTL %>% dplyr::select(ID, ASD, CTL) %>% melt %>% 
              ggplot(aes(variable, value, fill=variable)) + geom_boxplot() + 
                     coord_cartesian(ylim = c(0, 500)) + theme_minimal())

subplot(p1, p2, nrows=1)
remove(p1, p2, ASD_vs_CTL)

VST normalised data

datExpr_ASD = datExpr %>% dplyr::select(which(datMeta$Diagnosis_=='ASD')) %>% rowMeans
datExpr_CTL = datExpr %>% dplyr::select(which(datMeta$Diagnosis_!='ASD')) %>% rowMeans

ASD_vs_CTL = data.frame('ID'=rownames(datExpr), 'ASD'=datExpr_ASD+1, 'CTL'=datExpr_CTL+1) %>%
             left_join(SFARI_genes, by='ID') %>% mutate(`gene-score`=as.factor(`gene-score`))

p1 = ggplotly(ASD_vs_CTL %>% ggplot(aes(x=ASD, y=CTL, color=`gene-score`)) + 
              scale_x_continuous(trans='log2') + scale_y_continuous(trans='log2') + geom_point(alpha=0.4) + 
              geom_smooth(method=lm, se=FALSE, color='gray') +
              ggtitle('Mean expression by gene ASD vs CTL') + theme_minimal())

p2 = ggplotly(ASD_vs_CTL %>% dplyr::select(ID, ASD, CTL) %>% melt %>% 
              ggplot(aes(variable, value, fill=variable)) + geom_boxplot() + theme_minimal())

subplot(p1, p2, nrows=1)
remove(p1, p2, asd_ctl_mean)

By Brain Region

scatter_plot_by_lobe = function(lobe){
  
  lobe_samples = as.numeric(datMeta$Brain_lobe==lobe)

  datExpr_ASD = datExpr %>% dplyr::select(which(datMeta$Diagnosis_=='ASD' & datMeta$Brain_lobe==lobe)) %>% rowMeans
  datExpr_CTL = datExpr %>% dplyr::select(which(datMeta$Diagnosis_!='ASD' & datMeta$Brain_lobe==lobe)) %>% rowMeans
  
  ASD_vs_CTL = data.frame('ID'=rownames(datExpr), 'ASD'=datExpr_ASD+1, 'CTL'=datExpr_CTL+1) %>%
             left_join(SFARI_genes, by='ID') %>% mutate(`gene-score`=as.factor(`gene-score`))

  p1 = ASD_vs_CTL %>% ggplot(aes(x=ASD, y=CTL, color=`gene-score`)) + 
       scale_x_continuous(trans='log2') + scale_y_continuous(trans='log2') + geom_point(alpha=0.4) + 
       geom_smooth(method=lm, se=FALSE, color='gray') +
       ggtitle(glue('Mean expression by gene ASD vs CTL in ',lobe, ' lobe')) + theme_minimal()
  
  p2 = ASD_vs_CTL %>% dplyr::select(ID, ASD, CTL) %>% melt %>% 
       ggplot(aes(variable, value, fill=variable)) + geom_boxplot() + theme_minimal()
  
  return(list(p1,p2))
}

ps = list()
for(lobe in unique(datMeta$Brain_lobe)){
  ps[[lobe]] = scatter_plot_by_lobe(lobe)
}

require(grid.arrange)
grid.arrange(ps[['Frontal']][[1]],   ps[['Frontal']][[2]],
             ps[['Temporal']][[1]],  ps[['Temporal']][[2]],
             ps[['Parietal']][[1]],  ps[['Parietal']][[2]],
             ps[['Occipital']][[1]], ps[['Occipital']][[2]], nrow=4)